# -*- coding: utf-8 -*-
"""
    @File Name:链家二手房.py
    @Author: name
    @Mail: name@qq.com
    @Created Time: 2021/7/17 17:33
    @Description: 郑州链家楼盘

"""
import os
import io
import sys
import yaml
import time
import logging
import pymysql
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from ratelimiter import RateLimiter
from logging.handlers import RotatingFileHandler
from urllib.parse import urlparse, ParseResult, urlunparse


# 解决相应结果页面编码问题
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')


class connect_mysql(object):
    # 执行下面的execute_sql方法时会自动执行该初始化方法进行连接数据库
    def __init__(self, config):
        self.config = config
        # 建立连接
        self.conn = pymysql.connect(
            host=self.config['host'],
            port=self.config['port'],
            user=self.config['user'],
            password=self.config['password'],
            db=self.config['database'],
            charset='utf8'
        )
        # 创建游标
        # pymysql.cursors.DictCursor 返回字典列表格式数据
        self.cursor = self.conn.cursor(pymysql.cursors.DictCursor)

    def execute_sql(self, sql):
        """
        执行sql语句，并commit提交
        :param sql:需要执行的sql语句
        :return:
        """
        self.cursor.execute(sql)
        self.conn.commit()

    def execute_many_sql(self, sql, data):
        self.cursor.executemany(sql, data)
        self.conn.commit()

    def get_data(self):
        """
        获得查询数据
        :return: 返回查到的数据
        """
        data = self.cursor.fetchall()
        return data

    def close_oracle(self):
        # 关闭游标
        self.cursor.close()
        # 关闭数据库连接
        self.conn.close()


class MyDaemon(object):
    def __init__(self):
        self.url = "https://zz.lianjia.com/loupan/pg{0}"
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }

    @RateLimiter(max_calls=1, period=30)
    def send_request(self, url):
        """链家楼盘"""
        resp = requests.get(url, headers=self.headers)
        if resp.status_code == 200:
            return resp

    def parse_html(self, resp):
        """解析html页面"""
        # 解析url链接中的各个部分
        o = urlparse(resp.url)
        html = resp.text
        bs = BeautifulSoup(html, 'lxml')
        ul = bs.find('ul', class_='resblock-list-wrapper')
        li_list = ul.find_all('li')

        lst = []
        for item in li_list:
            img_link = item.find('a', class_='resblock-img-wrapper')['href']
            # 组合parse到的协议/网络地址，以及目的页面的相对路径
            t = ParseResult(o.scheme, o.netloc, img_link, "", "", "")
            img_link = urlunparse(t)

            text_list = item.find('div', class_='resblock-name')
            title = text_list.find('a', class_='name').text
            resblock_type = text_list.find('span', class_='resblock-type').text
            sale_status = text_list.find('span', class_='sale-status').text

            houseInfo = item.find('div', class_='resblock-location')

            locat_list = houseInfo.find_all('span')
            area = locat_list[0].text
            district = locat_list[1].text
            address_test = houseInfo.find('a').text
            address_link = houseInfo.find('a')['href']
            t = ParseResult(o.scheme, o.netloc, address_link, "", "", "")
            address_link = urlunparse(t)
            resblock_room = item.find('a', class_='resblock-room').text.replace('\n', '')
            resblock_area = item.find('div', class_='resblock-area').text.split(' ')[-1] if item.find('div',
                                                                                                      class_='resblock-area') else ''
            unit_price = item.find('div', class_='main-price').find('span', class_='number').text

            total_price = item.find('div', class_='second').text[2:] if item.find('div', class_='second') else ''

            tag = item.find('div', class_='resblock-tag').text.replace('\n', ',').strip(',')

            lst.append(
                (title, resblock_type, sale_status, area, district, address_test, resblock_room, resblock_area,
                 unit_price, total_price, tag, img_link, address_link))

        time.sleep(1)
        # print(self.page, lst)
        # 保存数据
        self.save_data(lst)

    def save_data(self, data):
        """保存数据"""
        """保存文件 or 数据库都可"""
        insert_sql = "insert into zz_lianjia_loupan (title, resblock_type, sale_status, area, district, address_test, resblock_room,resblock_area, unit_price, total_price, tag, img_link, address_link) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        self.db.execute_many_sql(insert_sql, data)
        print(self.page, '插入完毕')

    def select_data(self):
        """查询数据"""
        sql = "select * from zz_lianjia_loupan"
        self.db.execute_sql(sql)
        res = self.db.get_data()
        return res

    def format_data(self, data):
        """格式化数据"""
        print(data)
        df = pd.DataFrame(data,columns=data[0].keys())
        print(df)
        df.to_excel('../data/lianjia_loupan.xls',index=False)
        logging.info('保存成功')

    def run(self):
        ## 如果没有此文件，请手动配置config 字典
        ## {'mysql': {'host': '', 'port': 3306, 'user': '', 'password': '', 'database': ''},...}
        config_file = open(os.path.dirname(os.path.abspath(__file__)) + '/config.yaml', encoding='utf-8')
        self.config = yaml.safe_load(config_file)
        config_file.close()
        ########### 如有报错 此块注释 #########
        name = 'lianjie_house'
        logging.basicConfig(level=logging.INFO)
        handler = RotatingFileHandler(os.path.dirname(os.path.abspath(__file__)) + '/../logs/%s.log' % name,
                                      maxBytes=134217728, backupCount=7)
        formatter = logging.Formatter('%(asctime)s - %(lineno)d- %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        logging.getLogger().addHandler(handler)
        requests_log = logging.getLogger("requests.packages.urllib3")
        requests_log.setLevel(logging.ERROR)
        ########### 如有报错 此块注释 #########

        logging.warning('启动 [%s]', name)
        logging.warning('主线程 PID [%s]', os.getpid())

        self.db = connect_mysql(self.config['mysql'])

        data = self.select_data()
        if not data:
            # 链家楼盘
            for i in range(1, 95):
                self.page = i
                full_url = self.url.format(i)
                resp = self.send_request(full_url)
                self.parse_html(resp)
        else:
            self.format_data(data)

        self.db.close_oracle()


if __name__ == '__main__':
    my_daemon = MyDaemon()
    my_daemon.run()
